{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gym\n",
    "import numpy as np\n",
    "import time\n",
    "import sys\n",
    "from matplotlib import pyplot as plt\n",
    "sys.path.append(\"/home/wzy/reinforcementLearning/project/gym-master/gym/envs/toy_text\") \n",
    "from cliffwalking import CliffWalkingEnv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "导入相关库，由于需要修改cliffwalking环境，所以需要通过sys来导入本地的cliffwalking环境"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "class SarsaAgent(object):\n",
    "    def __init__(self, obs_n, act_n, learning_rate=0.01, gamma=0.9, e_greed=0.1):\n",
    "        self.act_n = act_n      # 动作维度，有几个动作可选\n",
    "        self.lr = learning_rate # 学习率\n",
    "        self.gamma = gamma      # reward的衰减率\n",
    "        self.epsilon = e_greed  # 按一定概率随机选动作\n",
    "        self.Q = np.zeros((obs_n, act_n))\n",
    "\n",
    "    # 输入观察值，采样输出的action值\n",
    "    def sample(self, observation):\n",
    "        if np.random.uniform(0, 1) < (1.0 - self.epsilon): #根据qtable中的Q值选择action\n",
    "            act = self.predict(observation)\n",
    "        else:\n",
    "            act = np.random.choice(self.act_n) #随机概率进行随机选择一个action\n",
    "        return act\n",
    "\n",
    "    # 输入观察值，进行预测，预测输出的action值\n",
    "    def predict(self, observation):\n",
    "        qlist = self.Q[observation, :]\n",
    "        maxq = np.max(qlist)\n",
    "        action_list = np.where(qlist == maxq)[0]  # 这里maxq不一定是唯一的，可能对用多个action值\n",
    "        act = np.random.choice(action_list)\n",
    "        return act\n",
    "\n",
    "    # 学习更新Q-table的方法\n",
    "    def learn(self, observation, act, reward, next_obs, next_act, done):\n",
    "        predict_Q = self.Q[observation, act]\n",
    "        if done:\n",
    "            target_Q = reward # 没有下一个状态了\n",
    "        else:\n",
    "            target_Q = reward + self.gamma * self.Q[next_obs, next_act] # Sarsa\n",
    "        self.Q[observation, act] += self.lr * (target_Q - predict_Q) # 修正q\n",
    "    \n",
    "    # 保存Q表格数据到文件\n",
    "    def save(self):\n",
    "        npy_file = './qlist.npy'\n",
    "        np.save(npy_file, self.Q)\n",
    "        print(npy_file + ' saved.')\n",
    "\n",
    "    # 从文件中读取Q值到Q表格中\n",
    "    def restore(self, npy_file='./qlist.npy'):\n",
    "        self.Q = np.load(npy_file)\n",
    "        print(npy_file + ' loaded.')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "构建sarsa类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_episode(env, agent, render=False):\n",
    "    total_steps = 0 # 记录每个episode中一共走了多少step\n",
    "    total_reward = 0#记录总的reward\n",
    "    observation=env.reset() # 重置环境, 重新开一局（即开始新的一个episode）\n",
    "    act = agent.sample(observation) # 根据算法选择一个动作\n",
    "\n",
    "    while True:\n",
    "        next_obs, reward, done, _ = env.step(act) # 与环境进行一个交互\n",
    "        next_act = agent.sample(next_obs) # 根据算法选择一个动作\n",
    "        # 训练 Sarsa 算法\n",
    "        agent.learn(observation, act, reward, next_obs, next_act, done)\n",
    "\n",
    "        act = next_act\n",
    "        observation = next_obs  # 存储上一个观察值\n",
    "        total_reward += reward\n",
    "        total_steps += 1 # 计算step数\n",
    "        if render:\n",
    "            env.render() #渲染新的一帧图形\n",
    "        if done:\n",
    "            break\n",
    "    return total_reward, total_steps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def test_episode(env, agent):\n",
    "    total_reward = 0\n",
    "    observation = env.reset()\n",
    "    while True:\n",
    "        act = agent.predict(observation) # greedy\n",
    "        next_obs, reward, done, _ = env.step(act)\n",
    "        total_reward += reward\n",
    "        observation = next_obs\n",
    "        time.sleep(0.5)\n",
    "        env.render()\n",
    "        if done:\n",
    "            break\n",
    "    return total_reward"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Episode 0: steps = 904 , reward = -2587.0\n",
      "Episode 1: steps = 269 , reward = -368.0\n",
      "Episode 2: steps = 325 , reward = -424.0\n",
      "Episode 3: steps = 453 , reward = -552.0\n",
      "Episode 4: steps = 179 , reward = -179.0\n",
      "Episode 5: steps = 104 , reward = -104.0\n",
      "Episode 6: steps = 146 , reward = -245.0\n",
      "Episode 7: steps = 89 , reward = -89.0\n",
      "Episode 8: steps = 159 , reward = -159.0\n",
      "Episode 9: steps = 88 , reward = -187.0\n",
      "Episode 10: steps = 87 , reward = -285.0\n",
      "Episode 11: steps = 247 , reward = -346.0\n",
      "Episode 12: steps = 118 , reward = -118.0\n",
      "Episode 13: steps = 60 , reward = -60.0\n",
      "Episode 14: steps = 75 , reward = -174.0\n",
      "Episode 15: steps = 212 , reward = -311.0\n",
      "Episode 16: steps = 63 , reward = -63.0\n",
      "Episode 17: steps = 52 , reward = -52.0\n",
      "Episode 18: steps = 144 , reward = -144.0\n",
      "Episode 19: steps = 109 , reward = -208.0\n",
      "Episode 20: steps = 50 , reward = -50.0\n",
      "Episode 21: steps = 105 , reward = -105.0\n",
      "Episode 22: steps = 70 , reward = -70.0\n",
      "Episode 23: steps = 126 , reward = -225.0\n",
      "Episode 24: steps = 155 , reward = -254.0\n",
      "Episode 25: steps = 53 , reward = -53.0\n",
      "Episode 26: steps = 124 , reward = -124.0\n",
      "Episode 27: steps = 68 , reward = -68.0\n",
      "Episode 28: steps = 71 , reward = -71.0\n",
      "Episode 29: steps = 94 , reward = -94.0\n",
      "Episode 30: steps = 53 , reward = -53.0\n",
      "Episode 31: steps = 55 , reward = -55.0\n",
      "Episode 32: steps = 99 , reward = -99.0\n",
      "Episode 33: steps = 26 , reward = -125.0\n",
      "Episode 34: steps = 118 , reward = -217.0\n",
      "Episode 35: steps = 61 , reward = -160.0\n",
      "Episode 36: steps = 97 , reward = -295.0\n",
      "Episode 37: steps = 40 , reward = -40.0\n",
      "Episode 38: steps = 67 , reward = -67.0\n",
      "Episode 39: steps = 50 , reward = -50.0\n",
      "Episode 40: steps = 59 , reward = -59.0\n",
      "Episode 41: steps = 38 , reward = -38.0\n",
      "Episode 42: steps = 48 , reward = -48.0\n",
      "Episode 43: steps = 65 , reward = -65.0\n",
      "Episode 44: steps = 97 , reward = -97.0\n",
      "Episode 45: steps = 47 , reward = -47.0\n",
      "Episode 46: steps = 51 , reward = -51.0\n",
      "Episode 47: steps = 57 , reward = -57.0\n",
      "Episode 48: steps = 37 , reward = -37.0\n",
      "Episode 49: steps = 58 , reward = -58.0\n",
      "Episode 50: steps = 49 , reward = -148.0\n",
      "Episode 51: steps = 79 , reward = -178.0\n",
      "Episode 52: steps = 60 , reward = -60.0\n",
      "Episode 53: steps = 60 , reward = -60.0\n",
      "Episode 54: steps = 58 , reward = -58.0\n",
      "Episode 55: steps = 57 , reward = -57.0\n",
      "Episode 56: steps = 29 , reward = -29.0\n",
      "Episode 57: steps = 59 , reward = -59.0\n",
      "Episode 58: steps = 56 , reward = -56.0\n",
      "Episode 59: steps = 78 , reward = -78.0\n",
      "Episode 60: steps = 52 , reward = -52.0\n",
      "Episode 61: steps = 44 , reward = -44.0\n",
      "Episode 62: steps = 78 , reward = -177.0\n",
      "Episode 63: steps = 44 , reward = -44.0\n",
      "Episode 64: steps = 42 , reward = -42.0\n",
      "Episode 65: steps = 26 , reward = -125.0\n",
      "Episode 66: steps = 76 , reward = -274.0\n",
      "Episode 67: steps = 35 , reward = -35.0\n",
      "Episode 68: steps = 35 , reward = -35.0\n",
      "Episode 69: steps = 59 , reward = -59.0\n",
      "Episode 70: steps = 55 , reward = -253.0\n",
      "Episode 71: steps = 33 , reward = -33.0\n",
      "Episode 72: steps = 28 , reward = -127.0\n",
      "Episode 73: steps = 46 , reward = -46.0\n",
      "Episode 74: steps = 65 , reward = -263.0\n",
      "Episode 75: steps = 42 , reward = -42.0\n",
      "Episode 76: steps = 55 , reward = -55.0\n",
      "Episode 77: steps = 23 , reward = -23.0\n",
      "Episode 78: steps = 61 , reward = -61.0\n",
      "Episode 79: steps = 36 , reward = -36.0\n",
      "Episode 80: steps = 57 , reward = -57.0\n",
      "Episode 81: steps = 33 , reward = -33.0\n",
      "Episode 82: steps = 33 , reward = -33.0\n",
      "Episode 83: steps = 33 , reward = -33.0\n",
      "Episode 84: steps = 94 , reward = -193.0\n",
      "Episode 85: steps = 48 , reward = -48.0\n",
      "Episode 86: steps = 37 , reward = -136.0\n",
      "Episode 87: steps = 34 , reward = -34.0\n",
      "Episode 88: steps = 50 , reward = -50.0\n",
      "Episode 89: steps = 53 , reward = -53.0\n",
      "Episode 90: steps = 50 , reward = -50.0\n",
      "Episode 91: steps = 32 , reward = -32.0\n",
      "Episode 92: steps = 41 , reward = -41.0\n",
      "Episode 93: steps = 57 , reward = -57.0\n",
      "Episode 94: steps = 36 , reward = -36.0\n",
      "Episode 95: steps = 41 , reward = -41.0\n",
      "Episode 96: steps = 22 , reward = -22.0\n",
      "Episode 97: steps = 42 , reward = -42.0\n",
      "Episode 98: steps = 39 , reward = -39.0\n",
      "Episode 99: steps = 53 , reward = -53.0\n",
      "Episode 100: steps = 39 , reward = -39.0\n",
      "Episode 101: steps = 31 , reward = -31.0\n",
      "Episode 102: steps = 27 , reward = -27.0\n",
      "Episode 103: steps = 33 , reward = -33.0\n",
      "Episode 104: steps = 44 , reward = -44.0\n",
      "Episode 105: steps = 25 , reward = -25.0\n",
      "Episode 106: steps = 70 , reward = -169.0\n",
      "Episode 107: steps = 35 , reward = -35.0\n",
      "Episode 108: steps = 52 , reward = -52.0\n",
      "Episode 109: steps = 36 , reward = -36.0\n",
      "Episode 110: steps = 41 , reward = -41.0\n",
      "Episode 111: steps = 33 , reward = -33.0\n",
      "Episode 112: steps = 25 , reward = -25.0\n",
      "Episode 113: steps = 35 , reward = -35.0\n",
      "Episode 114: steps = 25 , reward = -25.0\n",
      "Episode 115: steps = 31 , reward = -31.0\n",
      "Episode 116: steps = 39 , reward = -39.0\n",
      "Episode 117: steps = 27 , reward = -27.0\n",
      "Episode 118: steps = 45 , reward = -45.0\n",
      "Episode 119: steps = 25 , reward = -25.0\n",
      "Episode 120: steps = 23 , reward = -23.0\n",
      "Episode 121: steps = 47 , reward = -47.0\n",
      "Episode 122: steps = 27 , reward = -27.0\n",
      "Episode 123: steps = 17 , reward = -17.0\n",
      "Episode 124: steps = 44 , reward = -44.0\n",
      "Episode 125: steps = 29 , reward = -29.0\n",
      "Episode 126: steps = 21 , reward = -21.0\n",
      "Episode 127: steps = 38 , reward = -38.0\n",
      "Episode 128: steps = 45 , reward = -45.0\n",
      "Episode 129: steps = 23 , reward = -23.0\n",
      "Episode 130: steps = 45 , reward = -45.0\n",
      "Episode 131: steps = 23 , reward = -23.0\n",
      "Episode 132: steps = 23 , reward = -23.0\n",
      "Episode 133: steps = 23 , reward = -23.0\n",
      "Episode 134: steps = 21 , reward = -21.0\n",
      "Episode 135: steps = 49 , reward = -148.0\n",
      "Episode 136: steps = 52 , reward = -52.0\n",
      "Episode 137: steps = 32 , reward = -32.0\n",
      "Episode 138: steps = 52 , reward = -151.0\n",
      "Episode 139: steps = 23 , reward = -23.0\n",
      "Episode 140: steps = 25 , reward = -25.0\n",
      "Episode 141: steps = 35 , reward = -35.0\n",
      "Episode 142: steps = 77 , reward = -176.0\n",
      "Episode 143: steps = 43 , reward = -43.0\n",
      "Episode 144: steps = 21 , reward = -21.0\n",
      "Episode 145: steps = 34 , reward = -133.0\n",
      "Episode 146: steps = 29 , reward = -29.0\n",
      "Episode 147: steps = 36 , reward = -36.0\n",
      "Episode 148: steps = 35 , reward = -35.0\n",
      "Episode 149: steps = 54 , reward = -153.0\n",
      "Episode 150: steps = 41 , reward = -140.0\n",
      "Episode 151: steps = 37 , reward = -37.0\n",
      "Episode 152: steps = 30 , reward = -30.0\n",
      "Episode 153: steps = 27 , reward = -27.0\n",
      "Episode 154: steps = 27 , reward = -27.0\n",
      "Episode 155: steps = 64 , reward = -262.0\n",
      "Episode 156: steps = 19 , reward = -19.0\n",
      "Episode 157: steps = 38 , reward = -38.0\n",
      "Episode 158: steps = 31 , reward = -31.0\n",
      "Episode 159: steps = 23 , reward = -23.0\n",
      "Episode 160: steps = 17 , reward = -17.0\n",
      "Episode 161: steps = 28 , reward = -28.0\n",
      "Episode 162: steps = 34 , reward = -34.0\n",
      "Episode 163: steps = 24 , reward = -24.0\n",
      "Episode 164: steps = 28 , reward = -28.0\n",
      "Episode 165: steps = 17 , reward = -17.0\n",
      "Episode 166: steps = 19 , reward = -19.0\n",
      "Episode 167: steps = 32 , reward = -32.0\n",
      "Episode 168: steps = 25 , reward = -25.0\n",
      "Episode 169: steps = 33 , reward = -33.0\n",
      "Episode 170: steps = 32 , reward = -32.0\n",
      "Episode 171: steps = 30 , reward = -30.0\n",
      "Episode 172: steps = 25 , reward = -25.0\n",
      "Episode 173: steps = 22 , reward = -22.0\n",
      "Episode 174: steps = 29 , reward = -29.0\n",
      "Episode 175: steps = 32 , reward = -32.0\n",
      "Episode 176: steps = 27 , reward = -27.0\n",
      "Episode 177: steps = 19 , reward = -19.0\n",
      "Episode 178: steps = 36 , reward = -36.0\n",
      "Episode 179: steps = 23 , reward = -23.0\n",
      "Episode 180: steps = 23 , reward = -23.0\n",
      "Episode 181: steps = 24 , reward = -24.0\n",
      "Episode 182: steps = 17 , reward = -17.0\n",
      "Episode 183: steps = 32 , reward = -32.0\n",
      "Episode 184: steps = 27 , reward = -27.0\n",
      "Episode 185: steps = 21 , reward = -21.0\n",
      "Episode 186: steps = 27 , reward = -27.0\n",
      "Episode 187: steps = 21 , reward = -21.0\n",
      "Episode 188: steps = 27 , reward = -27.0\n",
      "Episode 189: steps = 21 , reward = -21.0\n",
      "Episode 190: steps = 21 , reward = -21.0\n",
      "Episode 191: steps = 27 , reward = -27.0\n",
      "Episode 192: steps = 21 , reward = -21.0\n",
      "Episode 193: steps = 23 , reward = -23.0\n",
      "Episode 194: steps = 33 , reward = -33.0\n",
      "Episode 195: steps = 57 , reward = -156.0\n",
      "Episode 196: steps = 26 , reward = -26.0\n",
      "Episode 197: steps = 25 , reward = -25.0\n",
      "Episode 198: steps = 25 , reward = -25.0\n",
      "Episode 199: steps = 22 , reward = -22.0\n",
      "Episode 200: steps = 17 , reward = -17.0\n",
      "Episode 201: steps = 25 , reward = -25.0\n",
      "Episode 202: steps = 31 , reward = -31.0\n",
      "Episode 203: steps = 26 , reward = -26.0\n",
      "Episode 204: steps = 28 , reward = -28.0\n",
      "Episode 205: steps = 36 , reward = -135.0\n",
      "Episode 206: steps = 27 , reward = -27.0\n",
      "Episode 207: steps = 20 , reward = -20.0\n",
      "Episode 208: steps = 41 , reward = -41.0\n",
      "Episode 209: steps = 21 , reward = -21.0\n",
      "Episode 210: steps = 42 , reward = -141.0\n",
      "Episode 211: steps = 29 , reward = -29.0\n",
      "Episode 212: steps = 34 , reward = -34.0\n",
      "Episode 213: steps = 21 , reward = -21.0\n",
      "Episode 214: steps = 17 , reward = -17.0\n",
      "Episode 215: steps = 23 , reward = -23.0\n",
      "Episode 216: steps = 25 , reward = -25.0\n",
      "Episode 217: steps = 29 , reward = -29.0\n",
      "Episode 218: steps = 27 , reward = -27.0\n",
      "Episode 219: steps = 28 , reward = -28.0\n",
      "Episode 220: steps = 21 , reward = -21.0\n",
      "Episode 221: steps = 22 , reward = -22.0\n",
      "Episode 222: steps = 27 , reward = -27.0\n",
      "Episode 223: steps = 31 , reward = -31.0\n",
      "Episode 224: steps = 21 , reward = -21.0\n",
      "Episode 225: steps = 24 , reward = -24.0\n",
      "Episode 226: steps = 21 , reward = -21.0\n",
      "Episode 227: steps = 33 , reward = -132.0\n",
      "Episode 228: steps = 21 , reward = -21.0\n",
      "Episode 229: steps = 19 , reward = -19.0\n",
      "Episode 230: steps = 28 , reward = -28.0\n",
      "Episode 231: steps = 20 , reward = -119.0\n",
      "Episode 232: steps = 28 , reward = -28.0\n",
      "Episode 233: steps = 27 , reward = -27.0\n",
      "Episode 234: steps = 22 , reward = -22.0\n",
      "Episode 235: steps = 17 , reward = -17.0\n",
      "Episode 236: steps = 30 , reward = -30.0\n",
      "Episode 237: steps = 18 , reward = -18.0\n",
      "Episode 238: steps = 17 , reward = -17.0\n",
      "Episode 239: steps = 32 , reward = -131.0\n",
      "Episode 240: steps = 24 , reward = -24.0\n",
      "Episode 241: steps = 25 , reward = -25.0\n",
      "Episode 242: steps = 25 , reward = -25.0\n",
      "Episode 243: steps = 23 , reward = -23.0\n",
      "Episode 244: steps = 31 , reward = -31.0\n",
      "Episode 245: steps = 28 , reward = -28.0\n",
      "Episode 246: steps = 23 , reward = -23.0\n",
      "Episode 247: steps = 19 , reward = -19.0\n",
      "Episode 248: steps = 34 , reward = -34.0\n",
      "Episode 249: steps = 25 , reward = -25.0\n",
      "Episode 250: steps = 27 , reward = -27.0\n",
      "Episode 251: steps = 29 , reward = -29.0\n",
      "Episode 252: steps = 20 , reward = -20.0\n",
      "Episode 253: steps = 23 , reward = -23.0\n",
      "Episode 254: steps = 19 , reward = -19.0\n",
      "Episode 255: steps = 27 , reward = -27.0\n",
      "Episode 256: steps = 29 , reward = -29.0\n",
      "Episode 257: steps = 23 , reward = -23.0\n",
      "Episode 258: steps = 19 , reward = -19.0\n",
      "Episode 259: steps = 23 , reward = -23.0\n",
      "Episode 260: steps = 23 , reward = -23.0\n",
      "Episode 261: steps = 27 , reward = -126.0\n",
      "Episode 262: steps = 19 , reward = -19.0\n",
      "Episode 263: steps = 26 , reward = -26.0\n",
      "Episode 264: steps = 31 , reward = -31.0\n",
      "Episode 265: steps = 19 , reward = -19.0\n",
      "Episode 266: steps = 34 , reward = -133.0\n",
      "Episode 267: steps = 19 , reward = -19.0\n",
      "Episode 268: steps = 25 , reward = -25.0\n",
      "Episode 269: steps = 23 , reward = -23.0\n",
      "Episode 270: steps = 33 , reward = -33.0\n",
      "Episode 271: steps = 17 , reward = -17.0\n",
      "Episode 272: steps = 21 , reward = -21.0\n",
      "Episode 273: steps = 15 , reward = -15.0\n",
      "Episode 274: steps = 19 , reward = -19.0\n",
      "Episode 275: steps = 15 , reward = -15.0\n",
      "Episode 276: steps = 15 , reward = -15.0\n",
      "Episode 277: steps = 18 , reward = -117.0\n",
      "Episode 278: steps = 29 , reward = -29.0\n",
      "Episode 279: steps = 23 , reward = -23.0\n",
      "Episode 280: steps = 21 , reward = -21.0\n",
      "Episode 281: steps = 19 , reward = -19.0\n",
      "Episode 282: steps = 40 , reward = -139.0\n",
      "Episode 283: steps = 37 , reward = -136.0\n",
      "Episode 284: steps = 33 , reward = -33.0\n",
      "Episode 285: steps = 21 , reward = -21.0\n",
      "Episode 286: steps = 19 , reward = -19.0\n",
      "Episode 287: steps = 15 , reward = -15.0\n",
      "Episode 288: steps = 17 , reward = -17.0\n",
      "Episode 289: steps = 30 , reward = -30.0\n",
      "Episode 290: steps = 27 , reward = -27.0\n",
      "Episode 291: steps = 17 , reward = -17.0\n",
      "Episode 292: steps = 24 , reward = -24.0\n",
      "Episode 293: steps = 17 , reward = -17.0\n",
      "Episode 294: steps = 17 , reward = -17.0\n",
      "Episode 295: steps = 19 , reward = -19.0\n",
      "Episode 296: steps = 20 , reward = -20.0\n",
      "Episode 297: steps = 17 , reward = -17.0\n",
      "Episode 298: steps = 17 , reward = -17.0\n",
      "Episode 299: steps = 21 , reward = -21.0\n",
      "Episode 300: steps = 19 , reward = -19.0\n",
      "Episode 301: steps = 15 , reward = -15.0\n",
      "Episode 302: steps = 21 , reward = -21.0\n",
      "Episode 303: steps = 15 , reward = -15.0\n",
      "Episode 304: steps = 16 , reward = -16.0\n",
      "Episode 305: steps = 17 , reward = -17.0\n",
      "Episode 306: steps = 21 , reward = -21.0\n",
      "Episode 307: steps = 17 , reward = -17.0\n",
      "Episode 308: steps = 15 , reward = -15.0\n",
      "Episode 309: steps = 21 , reward = -21.0\n",
      "Episode 310: steps = 17 , reward = -17.0\n",
      "Episode 311: steps = 15 , reward = -15.0\n",
      "Episode 312: steps = 17 , reward = -17.0\n",
      "Episode 313: steps = 33 , reward = -33.0\n",
      "Episode 314: steps = 18 , reward = -18.0\n",
      "Episode 315: steps = 15 , reward = -15.0\n",
      "Episode 316: steps = 22 , reward = -22.0\n",
      "Episode 317: steps = 17 , reward = -17.0\n",
      "Episode 318: steps = 17 , reward = -17.0\n",
      "Episode 319: steps = 17 , reward = -17.0\n",
      "Episode 320: steps = 19 , reward = -19.0\n",
      "Episode 321: steps = 17 , reward = -17.0\n",
      "Episode 322: steps = 15 , reward = -15.0\n",
      "Episode 323: steps = 28 , reward = -28.0\n",
      "Episode 324: steps = 23 , reward = -23.0\n",
      "Episode 325: steps = 19 , reward = -19.0\n",
      "Episode 326: steps = 21 , reward = -21.0\n",
      "Episode 327: steps = 18 , reward = -18.0\n",
      "Episode 328: steps = 21 , reward = -120.0\n",
      "Episode 329: steps = 26 , reward = -26.0\n",
      "Episode 330: steps = 23 , reward = -23.0\n",
      "Episode 331: steps = 20 , reward = -20.0\n",
      "Episode 332: steps = 33 , reward = -33.0\n",
      "Episode 333: steps = 23 , reward = -23.0\n",
      "Episode 334: steps = 29 , reward = -29.0\n",
      "Episode 335: steps = 29 , reward = -29.0\n",
      "Episode 336: steps = 23 , reward = -23.0\n",
      "Episode 337: steps = 23 , reward = -23.0\n",
      "Episode 338: steps = 27 , reward = -27.0\n",
      "Episode 339: steps = 21 , reward = -21.0\n",
      "Episode 340: steps = 24 , reward = -24.0\n",
      "Episode 341: steps = 19 , reward = -19.0\n",
      "Episode 342: steps = 19 , reward = -19.0\n",
      "Episode 343: steps = 23 , reward = -23.0\n",
      "Episode 344: steps = 19 , reward = -19.0\n",
      "Episode 345: steps = 48 , reward = -48.0\n",
      "Episode 346: steps = 19 , reward = -19.0\n",
      "Episode 347: steps = 26 , reward = -26.0\n",
      "Episode 348: steps = 17 , reward = -17.0\n",
      "Episode 349: steps = 27 , reward = -27.0\n",
      "Episode 350: steps = 17 , reward = -17.0\n",
      "Episode 351: steps = 20 , reward = -20.0\n",
      "Episode 352: steps = 17 , reward = -17.0\n",
      "Episode 353: steps = 26 , reward = -26.0\n",
      "Episode 354: steps = 34 , reward = -34.0\n",
      "Episode 355: steps = 18 , reward = -18.0\n",
      "Episode 356: steps = 42 , reward = -141.0\n",
      "Episode 357: steps = 35 , reward = -134.0\n",
      "Episode 358: steps = 26 , reward = -26.0\n",
      "Episode 359: steps = 30 , reward = -30.0\n",
      "Episode 360: steps = 17 , reward = -17.0\n",
      "Episode 361: steps = 20 , reward = -20.0\n",
      "Episode 362: steps = 33 , reward = -33.0\n",
      "Episode 363: steps = 27 , reward = -27.0\n",
      "Episode 364: steps = 30 , reward = -30.0\n",
      "Episode 365: steps = 33 , reward = -231.0\n",
      "Episode 366: steps = 19 , reward = -19.0\n",
      "Episode 367: steps = 34 , reward = -34.0\n",
      "Episode 368: steps = 19 , reward = -19.0\n",
      "Episode 369: steps = 19 , reward = -19.0\n",
      "Episode 370: steps = 27 , reward = -27.0\n",
      "Episode 371: steps = 23 , reward = -23.0\n",
      "Episode 372: steps = 42 , reward = -141.0\n",
      "Episode 373: steps = 26 , reward = -26.0\n",
      "Episode 374: steps = 19 , reward = -19.0\n",
      "Episode 375: steps = 24 , reward = -123.0\n",
      "Episode 376: steps = 20 , reward = -20.0\n",
      "Episode 377: steps = 20 , reward = -20.0\n",
      "Episode 378: steps = 23 , reward = -23.0\n",
      "Episode 379: steps = 23 , reward = -23.0\n",
      "Episode 380: steps = 17 , reward = -17.0\n",
      "Episode 381: steps = 17 , reward = -17.0\n",
      "Episode 382: steps = 27 , reward = -27.0\n",
      "Episode 383: steps = 27 , reward = -27.0\n",
      "Episode 384: steps = 19 , reward = -19.0\n",
      "Episode 385: steps = 22 , reward = -121.0\n",
      "Episode 386: steps = 21 , reward = -21.0\n",
      "Episode 387: steps = 27 , reward = -27.0\n",
      "Episode 388: steps = 17 , reward = -17.0\n",
      "Episode 389: steps = 21 , reward = -21.0\n",
      "Episode 390: steps = 25 , reward = -25.0\n",
      "Episode 391: steps = 18 , reward = -18.0\n",
      "Episode 392: steps = 19 , reward = -19.0\n",
      "Episode 393: steps = 25 , reward = -25.0\n",
      "Episode 394: steps = 21 , reward = -21.0\n",
      "Episode 395: steps = 22 , reward = -22.0\n",
      "Episode 396: steps = 21 , reward = -21.0\n",
      "Episode 397: steps = 17 , reward = -17.0\n",
      "Episode 398: steps = 23 , reward = -23.0\n",
      "Episode 399: steps = 23 , reward = -23.0\n",
      "Episode 400: steps = 19 , reward = -19.0\n",
      "Episode 401: steps = 19 , reward = -19.0\n",
      "Episode 402: steps = 21 , reward = -21.0\n",
      "Episode 403: steps = 21 , reward = -21.0\n",
      "Episode 404: steps = 21 , reward = -21.0\n",
      "Episode 405: steps = 23 , reward = -23.0\n",
      "Episode 406: steps = 19 , reward = -19.0\n",
      "Episode 407: steps = 18 , reward = -18.0\n",
      "Episode 408: steps = 17 , reward = -17.0\n",
      "Episode 409: steps = 21 , reward = -21.0\n",
      "Episode 410: steps = 29 , reward = -29.0\n",
      "Episode 411: steps = 23 , reward = -23.0\n",
      "Episode 412: steps = 23 , reward = -23.0\n",
      "Episode 413: steps = 17 , reward = -17.0\n",
      "Episode 414: steps = 22 , reward = -22.0\n",
      "Episode 415: steps = 19 , reward = -19.0\n",
      "Episode 416: steps = 18 , reward = -18.0\n",
      "Episode 417: steps = 19 , reward = -19.0\n",
      "Episode 418: steps = 21 , reward = -21.0\n",
      "Episode 419: steps = 17 , reward = -17.0\n",
      "Episode 420: steps = 17 , reward = -17.0\n",
      "Episode 421: steps = 17 , reward = -17.0\n",
      "Episode 422: steps = 33 , reward = -33.0\n",
      "Episode 423: steps = 47 , reward = -47.0\n",
      "Episode 424: steps = 21 , reward = -21.0\n",
      "Episode 425: steps = 20 , reward = -20.0\n",
      "Episode 426: steps = 21 , reward = -21.0\n",
      "Episode 427: steps = 23 , reward = -23.0\n",
      "Episode 428: steps = 17 , reward = -17.0\n",
      "Episode 429: steps = 17 , reward = -17.0\n",
      "Episode 430: steps = 17 , reward = -17.0\n",
      "Episode 431: steps = 15 , reward = -15.0\n",
      "Episode 432: steps = 15 , reward = -15.0\n",
      "Episode 433: steps = 37 , reward = -37.0\n",
      "Episode 434: steps = 19 , reward = -19.0\n",
      "Episode 435: steps = 19 , reward = -19.0\n",
      "Episode 436: steps = 17 , reward = -17.0\n",
      "Episode 437: steps = 15 , reward = -15.0\n",
      "Episode 438: steps = 17 , reward = -17.0\n",
      "Episode 439: steps = 17 , reward = -17.0\n",
      "Episode 440: steps = 15 , reward = -15.0\n",
      "Episode 441: steps = 15 , reward = -15.0\n",
      "Episode 442: steps = 15 , reward = -15.0\n",
      "Episode 443: steps = 19 , reward = -19.0\n",
      "Episode 444: steps = 15 , reward = -15.0\n",
      "Episode 445: steps = 19 , reward = -19.0\n",
      "Episode 446: steps = 19 , reward = -19.0\n",
      "Episode 447: steps = 19 , reward = -19.0\n",
      "Episode 448: steps = 19 , reward = -19.0\n",
      "Episode 449: steps = 21 , reward = -21.0\n",
      "Episode 450: steps = 22 , reward = -121.0\n",
      "Episode 451: steps = 19 , reward = -19.0\n",
      "Episode 452: steps = 16 , reward = -16.0\n",
      "Episode 453: steps = 17 , reward = -17.0\n",
      "Episode 454: steps = 15 , reward = -15.0\n",
      "Episode 455: steps = 17 , reward = -17.0\n",
      "Episode 456: steps = 17 , reward = -17.0\n",
      "Episode 457: steps = 19 , reward = -19.0\n",
      "Episode 458: steps = 20 , reward = -20.0\n",
      "Episode 459: steps = 18 , reward = -18.0\n",
      "Episode 460: steps = 19 , reward = -19.0\n",
      "Episode 461: steps = 19 , reward = -19.0\n",
      "Episode 462: steps = 18 , reward = -18.0\n",
      "Episode 463: steps = 25 , reward = -25.0\n",
      "Episode 464: steps = 17 , reward = -17.0\n",
      "Episode 465: steps = 21 , reward = -21.0\n",
      "Episode 466: steps = 21 , reward = -21.0\n",
      "Episode 467: steps = 18 , reward = -18.0\n",
      "Episode 468: steps = 22 , reward = -121.0\n",
      "Episode 469: steps = 19 , reward = -19.0\n",
      "Episode 470: steps = 30 , reward = -30.0\n",
      "Episode 471: steps = 19 , reward = -19.0\n",
      "Episode 472: steps = 19 , reward = -19.0\n",
      "Episode 473: steps = 17 , reward = -17.0\n",
      "Episode 474: steps = 19 , reward = -19.0\n",
      "Episode 475: steps = 20 , reward = -20.0\n",
      "Episode 476: steps = 19 , reward = -19.0\n",
      "Episode 477: steps = 19 , reward = -19.0\n",
      "Episode 478: steps = 21 , reward = -21.0\n",
      "Episode 479: steps = 17 , reward = -17.0\n",
      "Episode 480: steps = 29 , reward = -29.0\n",
      "Episode 481: steps = 19 , reward = -19.0\n",
      "Episode 482: steps = 25 , reward = -124.0\n",
      "Episode 483: steps = 18 , reward = -18.0\n",
      "Episode 484: steps = 19 , reward = -19.0\n",
      "Episode 485: steps = 19 , reward = -19.0\n",
      "Episode 486: steps = 18 , reward = -18.0\n",
      "Episode 487: steps = 17 , reward = -17.0\n",
      "Episode 488: steps = 19 , reward = -19.0\n",
      "Episode 489: steps = 21 , reward = -21.0\n",
      "Episode 490: steps = 19 , reward = -19.0\n",
      "Episode 491: steps = 21 , reward = -120.0\n",
      "Episode 492: steps = 17 , reward = -17.0\n",
      "Episode 493: steps = 21 , reward = -21.0\n",
      "Episode 494: steps = 19 , reward = -19.0\n",
      "Episode 495: steps = 20 , reward = -20.0\n",
      "Episode 496: steps = 26 , reward = -26.0\n",
      "Episode 497: steps = 22 , reward = -22.0\n",
      "Episode 498: steps = 19 , reward = -19.0\n",
      "Episode 499: steps = 21 , reward = -21.0\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "x  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  C  C  C  C  C  C  C  C  C  C  T\n",
      "\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "x  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  C  C  C  C  C  C  C  C  C  C  T\n",
      "\n",
      "x  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  C  C  C  C  C  C  C  C  C  C  T\n",
      "\n",
      "o  x  o  o  o  o  o  o  o  o  o  o\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  C  C  C  C  C  C  C  C  C  C  T\n",
      "\n",
      "o  o  x  o  o  o  o  o  o  o  o  o\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  C  C  C  C  C  C  C  C  C  C  T\n",
      "\n",
      "o  o  o  x  o  o  o  o  o  o  o  o\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  C  C  C  C  C  C  C  C  C  C  T\n",
      "\n",
      "o  o  o  o  x  o  o  o  o  o  o  o\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  C  C  C  C  C  C  C  C  C  C  T\n",
      "\n",
      "o  o  o  o  o  x  o  o  o  o  o  o\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  C  C  C  C  C  C  C  C  C  C  T\n",
      "\n",
      "o  o  o  o  o  o  x  o  o  o  o  o\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  C  C  C  C  C  C  C  C  C  C  T\n",
      "\n",
      "o  o  o  o  o  o  o  x  o  o  o  o\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  C  C  C  C  C  C  C  C  C  C  T\n",
      "\n",
      "o  o  o  o  o  o  o  o  x  o  o  o\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  C  C  C  C  C  C  C  C  C  C  T\n",
      "\n",
      "o  o  o  o  o  o  o  o  o  x  o  o\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  C  C  C  C  C  C  C  C  C  C  T\n",
      "\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  o  o  o  o  o  o  o  o  x  o  o\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  C  C  C  C  C  C  C  C  C  C  T\n",
      "\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  o  o  o  o  o  o  o  o  o  x  o\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  C  C  C  C  C  C  C  C  C  C  T\n",
      "\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  o  o  o  o  o  o  o  o  o  o  x\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  C  C  C  C  C  C  C  C  C  C  T\n",
      "\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  o  o  o  o  o  o  o  o  o  o  x\n",
      "o  C  C  C  C  C  C  C  C  C  C  T\n",
      "\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  o  o  o  o  o  o  o  o  o  o  o\n",
      "o  C  C  C  C  C  C  C  C  C  C  x\n",
      "\n",
      "test reward = -17.0\n",
      "./qlist.npy saved.\n"
     ]
    }
   ],
   "source": [
    "# 使用本地文件创建悬崖环境\n",
    "env = CliffWalkingEnv()  # 0 up, 1 right, 2 down, 3 left\n",
    "\n",
    "# 创建一个agent实例，输入超参数\n",
    "agent = SarsaAgent(\n",
    "        obs_n=env.observation_space.n,\n",
    "        act_n=env.action_space.n,\n",
    "        learning_rate=0.1,\n",
    "        gamma=0.9,\n",
    "        e_greed=0.1)\n",
    "\n",
    "\n",
    "# 训练500个episode，打印每个episode的分数\n",
    "for episode in range(500):\n",
    "    ep_reward, ep_steps = run_episode(env, agent, False)\n",
    "    print('Episode %s: steps = %s , reward = %.1f' % (episode, ep_steps, ep_reward))\n",
    "\n",
    "# 全部训练结束，查看算法效果\n",
    "test_reward = test_episode(env, agent)\n",
    "print('test reward = %.1f' % (test_reward))\n",
    "agent.save()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "关于reward调整的问题。通过实验发现，如果将踏入cliff的reward设置为很大的话，比如100万，会出现原地罚站的现象，一直在一个地方无法前进，具体表现为很难，甚至不能得到收敛。但是如果将reward从-100进行适当的降低，比如-200或者-500，也没有能得到更好的效果，reward设置为-100，最后的test reward为-15，reward设置为-200的时候，test reward也为-15，设置为-500的时候，test reward为-17.同样的，提高reward为-80，-50也没有显著的效果。"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "068f746a7a87e9f5cf75d2aabdd7dc03cdf29652afb2aef083074affc5c347cd"
  },
  "kernelspec": {
   "display_name": "Python 3.7.10 ('branchnet')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
